{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.3 Lab: Cross-Validation and the Bootstrap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import train_test_split,cross_val_score\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.3.1 The Validation Set Approach"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(397, 9)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mpg</th>\n",
       "      <th>cylinders</th>\n",
       "      <th>displacement</th>\n",
       "      <th>horsepower</th>\n",
       "      <th>weight</th>\n",
       "      <th>acceleration</th>\n",
       "      <th>year</th>\n",
       "      <th>origin</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>18.0</td>\n",
       "      <td>8</td>\n",
       "      <td>307.0</td>\n",
       "      <td>130</td>\n",
       "      <td>3504</td>\n",
       "      <td>12.0</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>chevrolet chevelle malibu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15.0</td>\n",
       "      <td>8</td>\n",
       "      <td>350.0</td>\n",
       "      <td>165</td>\n",
       "      <td>3693</td>\n",
       "      <td>11.5</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>buick skylark 320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>18.0</td>\n",
       "      <td>8</td>\n",
       "      <td>318.0</td>\n",
       "      <td>150</td>\n",
       "      <td>3436</td>\n",
       "      <td>11.0</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>plymouth satellite</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16.0</td>\n",
       "      <td>8</td>\n",
       "      <td>304.0</td>\n",
       "      <td>150</td>\n",
       "      <td>3433</td>\n",
       "      <td>12.0</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>amc rebel sst</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>17.0</td>\n",
       "      <td>8</td>\n",
       "      <td>302.0</td>\n",
       "      <td>140</td>\n",
       "      <td>3449</td>\n",
       "      <td>10.5</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>ford torino</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    mpg  cylinders  displacement horsepower  weight  acceleration  year  \\\n",
       "0  18.0          8         307.0        130    3504          12.0    70   \n",
       "1  15.0          8         350.0        165    3693          11.5    70   \n",
       "2  18.0          8         318.0        150    3436          11.0    70   \n",
       "3  16.0          8         304.0        150    3433          12.0    70   \n",
       "4  17.0          8         302.0        140    3449          10.5    70   \n",
       "\n",
       "   origin                       name  \n",
       "0       1  chevrolet chevelle malibu  \n",
       "1       1          buick skylark 320  \n",
       "2       1         plymouth satellite  \n",
       "3       1              amc rebel sst  \n",
       "4       1                ford torino  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv(r'E:\\programming\\dataset\\Into_to_statstical_learning\\Auto.csv')\n",
    "print(data.shape)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "mpg             float64\n",
       "cylinders         int64\n",
       "displacement    float64\n",
       "horsepower       object\n",
       "weight            int64\n",
       "acceleration    float64\n",
       "year              int64\n",
       "origin            int64\n",
       "name             object\n",
       "dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Since we can see the data type of horsepower is object, when it should be numrical\n",
    "data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['130', '165', '150', '140', '198', '220', '215', '225', '190',\n",
       "       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',\n",
       "       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',\n",
       "       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',\n",
       "       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',\n",
       "       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',\n",
       "       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',\n",
       "       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',\n",
       "       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',\n",
       "       '64', '74', '116', '82'], dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#by exploring the values of horsepower, it has a \"?\", due to which the while column is being considered of object datatype\n",
    "data['horsepower'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# removing the points which has '?' value of horsepower, we will first replace it by np.nan\n",
    "data['horsepower'] = data['horsepower'].replace('?',np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# than drop all the points which have np.nan\n",
    "data = data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# since now, all the points are numeric, change the datatype to int\n",
    "data['horsepower'] = data['horsepower'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(196,) (196,)\n"
     ]
    }
   ],
   "source": [
    "# for the first part we need to divide the set into two parts\n",
    "# test_size = 0.5, ie, both the sets have similar number of observations, 196\n",
    "X_train,X_test,y_train,y_test = train_test_split(data['horsepower'],data['mpg'],test_size = 0.5,random_state = 0)\n",
    "print(X_train.shape,X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error is  23.616617069669882\n"
     ]
    }
   ],
   "source": [
    "lr = LinearRegression()\n",
    "lr.fit(X_train.to_frame(),y_train)\n",
    "# since X_test, is a single column, it will be considered as a series, we need to change it to dataframe\n",
    "pred = lr.predict(X_test.to_frame())\n",
    "print('Mean squared error is ',mean_squared_error(y_test,pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error for quadratic is  18.76303134689764\n"
     ]
    }
   ],
   "source": [
    "# quadratic polynomial\n",
    "lr = LinearRegression()\n",
    "# include bias = False, will not return the constant value, that is X**0, as that is added automatically by lin regression\n",
    "poly = PolynomialFeatures(2,include_bias=False)\n",
    "poly.fit(X_train.to_frame())\n",
    "X_train_quad = poly.transform(X_train.to_frame())\n",
    "X_test_quad = poly.transform(X_test.to_frame())\n",
    "\n",
    "lr.fit(X_train_quad,y_train)\n",
    "pred = lr.predict(X_test_quad)\n",
    "\n",
    "print('Mean squared error for quadratic is ',mean_squared_error(y_test,pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean squared error for cubic is  18.79694163263048\n"
     ]
    }
   ],
   "source": [
    "#cubic polynomial\n",
    "lr = LinearRegression()\n",
    "poly = PolynomialFeatures(3,include_bias=False)\n",
    "poly.fit(X_train.to_frame())\n",
    "X_train_cubic = poly.transform(X_train.to_frame())\n",
    "X_test_cubic = poly.transform(X_test.to_frame())\n",
    "\n",
    "lr.fit(X_train_cubic,y_train)\n",
    "pred = lr.predict(X_test_cubic)\n",
    "\n",
    "print('Mean squared error for cubic is ',mean_squared_error(y_test,pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(196,) (196,)\n",
      "Mean squared error is  24.80212062059356\n",
      "Mean squared error for quadratic is  18.848292603275663\n",
      "Mean squared error for cubic is  18.805111358604574\n"
     ]
    }
   ],
   "source": [
    "# with different random state\n",
    "# earlier it was random state 0, now its 1\n",
    "X_train,X_test,y_train,y_test = train_test_split(data['horsepower'],data['mpg'],test_size = 0.5,random_state = 1)\n",
    "print(X_train.shape,X_test.shape)\n",
    "\n",
    "lr = LinearRegression()\n",
    "lr.fit(X_train.to_frame(),y_train)\n",
    "pred = lr.predict(X_test.to_frame())\n",
    "print('Mean squared error is ',mean_squared_error(y_test,pred))\n",
    "\n",
    "# quadratic polynomial\n",
    "lr = LinearRegression()\n",
    "poly = PolynomialFeatures(2,include_bias=False)\n",
    "poly.fit(X_train.to_frame())\n",
    "X_train_quad = poly.transform(X_train.to_frame())\n",
    "X_test_quad = poly.transform(X_test.to_frame())\n",
    "\n",
    "lr.fit(X_train_quad,y_train)\n",
    "pred = lr.predict(X_test_quad)\n",
    "\n",
    "print('Mean squared error for quadratic is ',mean_squared_error(y_test,pred))\n",
    "\n",
    "#cubic polynomial\n",
    "lr = LinearRegression()\n",
    "poly = PolynomialFeatures(3,include_bias = False)\n",
    "poly.fit(X_train.to_frame())\n",
    "X_train_cubic = poly.transform(X_train.to_frame())\n",
    "X_test_cubic = poly.transform(X_test.to_frame())\n",
    "\n",
    "lr.fit(X_train_cubic,y_train)\n",
    "pred = lr.predict(X_test_cubic)\n",
    "\n",
    "print('Mean squared error for cubic is ',mean_squared_error(y_test,pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.3.2 Leave-One-Out Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "error_list = []\n",
    "for power in range(1,6):\n",
    "    X = data['horsepower']\n",
    "    y = data['mpg']\n",
    "    poly = PolynomialFeatures(power,include_bias=False)\n",
    "    X = poly.fit_transform(X.to_frame())\n",
    "    \n",
    "    lr = LinearRegression()\n",
    "    # for LOOCV, the number of folds be will n = size of data\n",
    "    error_list.append(-1*cross_val_score(lr,X,y,cv = len(X),scoring = 'neg_mean_squared_error').mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DEGREE</th>\n",
       "      <th>MEAN SQUARED ERROR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>24.231514</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>19.248213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>19.334984</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>19.424430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>19.033213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   DEGREE  MEAN SQUARED ERROR\n",
       "0       1           24.231514\n",
       "1       2           19.248213\n",
       "2       3           19.334984\n",
       "3       4           19.424430\n",
       "4       5           19.033213"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({\"DEGREE\":np.arange(1,6),\"MEAN SQUARED ERROR\":error_list})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.3.3 k-Fold Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K FOLD CV\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DEGREE</th>\n",
       "      <th>MEAN SQUARED ERROR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>27.439934</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>21.235840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>21.336606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>21.353887</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>20.905641</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>20.780115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>20.968094</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>21.077654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>21.037602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>20.973329</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   DEGREE  MEAN SQUARED ERROR\n",
       "0       1           27.439934\n",
       "1       2           21.235840\n",
       "2       3           21.336606\n",
       "3       4           21.353887\n",
       "4       5           20.905641\n",
       "5       6           20.780115\n",
       "6       7           20.968094\n",
       "7       8           21.077654\n",
       "8       9           21.037602\n",
       "9      10           20.973329"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "error_list = []\n",
    "for power in range(1,11):\n",
    "    X = data['horsepower']\n",
    "    y = data['mpg']\n",
    "    poly = PolynomialFeatures(power,include_bias=False)\n",
    "    X = poly.fit_transform(X.to_frame())\n",
    "    \n",
    "    lr = LinearRegression()\n",
    "    error_list.append(-1*cross_val_score(lr,X,y,cv = 10,scoring = 'neg_mean_squared_error').mean())\n",
    "\n",
    "print('K FOLD CV')    \n",
    "pd.DataFrame({\"DEGREE\":np.arange(1,11),\"MEAN SQUARED ERROR\":error_list})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.3.4 The Bootstrap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100, 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X</th>\n",
       "      <th>Y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.895251</td>\n",
       "      <td>-0.234924</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-1.562454</td>\n",
       "      <td>-0.885176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.417090</td>\n",
       "      <td>0.271888</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.044356</td>\n",
       "      <td>-0.734198</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.315568</td>\n",
       "      <td>0.841983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          X         Y\n",
       "1 -0.895251 -0.234924\n",
       "2 -1.562454 -0.885176\n",
       "3 -0.417090  0.271888\n",
       "4  1.044356 -0.734198\n",
       "5 -0.315568  0.841983"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "portfolio = pd.read_csv(r'E:\\programming\\dataset\\Into_to_statstical_learning\\Portfolio.csv',index_col=0)\n",
    "print(portfolio.shape)\n",
    "portfolio.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we first define a function equivalent to func alpha defined in the Lab, i would recommend you to go through the function in lab\n",
    "# This function takes two arguements, data and indeces, this indices are used to calculate the estimate for alpha for this bootstrap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "def alpha(data,index):\n",
    "    X = data['X'].loc[index]\n",
    "    y = data['Y'].loc[index]\n",
    "    \n",
    "    return (np.var(y) - np.cov(X,y)[0][1]) / (np.var(X) + np.var(y) - 2*(np.cov(X,y)[0][1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# equivalent to sample function in lab\n",
    "def get_indices(data,num_samples):\n",
    "    return  np.random.choice(data.index, num_samples, replace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5771059047399711"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "alpha(portfolio,np.arange(1,100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6491078066222179"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(2)\n",
    "alpha(portfolio,get_indices(portfolio,100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# there is no built in function like boot, so, we will define one\n",
    "def boot(data,func,R):\n",
    "    estimates = []\n",
    "    for i in range(R):\n",
    "        estimates.append(func(data,get_indices(data,100)))\n",
    "    bootstrap_statistics = {'estimated_value':np.mean(estimates),'std_error':np.std(estimates)}   \n",
    "    return bootstrap_statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'estimated_value': 0.580591309592219, 'std_error': 0.08980409161302537}"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(0)\n",
    "results = boot(portfolio,alpha,1000)\n",
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Estimating the Accuracy of a Linear Regression Model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>mpg</th>\n",
       "      <th>cylinders</th>\n",
       "      <th>displacement</th>\n",
       "      <th>horsepower</th>\n",
       "      <th>weight</th>\n",
       "      <th>acceleration</th>\n",
       "      <th>year</th>\n",
       "      <th>origin</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>8</td>\n",
       "      <td>307.0</td>\n",
       "      <td>130</td>\n",
       "      <td>3504</td>\n",
       "      <td>12.0</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>chevrolet chevelle malibu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>15.0</td>\n",
       "      <td>8</td>\n",
       "      <td>350.0</td>\n",
       "      <td>165</td>\n",
       "      <td>3693</td>\n",
       "      <td>11.5</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>buick skylark 320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>18.0</td>\n",
       "      <td>8</td>\n",
       "      <td>318.0</td>\n",
       "      <td>150</td>\n",
       "      <td>3436</td>\n",
       "      <td>11.0</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>plymouth satellite</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>16.0</td>\n",
       "      <td>8</td>\n",
       "      <td>304.0</td>\n",
       "      <td>150</td>\n",
       "      <td>3433</td>\n",
       "      <td>12.0</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>amc rebel sst</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>17.0</td>\n",
       "      <td>8</td>\n",
       "      <td>302.0</td>\n",
       "      <td>140</td>\n",
       "      <td>3449</td>\n",
       "      <td>10.5</td>\n",
       "      <td>70</td>\n",
       "      <td>1</td>\n",
       "      <td>ford torino</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index   mpg  cylinders  displacement  horsepower  weight  acceleration  \\\n",
       "0      0  18.0          8         307.0         130    3504          12.0   \n",
       "1      1  15.0          8         350.0         165    3693          11.5   \n",
       "2      2  18.0          8         318.0         150    3436          11.0   \n",
       "3      3  16.0          8         304.0         150    3433          12.0   \n",
       "4      4  17.0          8         302.0         140    3449          10.5   \n",
       "\n",
       "   year  origin                       name  \n",
       "0    70       1  chevrolet chevelle malibu  \n",
       "1    70       1          buick skylark 320  \n",
       "2    70       1         plymouth satellite  \n",
       "3    70       1              amc rebel sst  \n",
       "4    70       1                ford torino  "
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# auto data used earlier in the notebook\n",
    "data = data.reset_index()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "# similar to boot.fn in lab\n",
    "\n",
    "def get_estimates(data,index):\n",
    "    X = data['horsepower'].loc[index]\n",
    "    y = data['mpg'].loc[index]\n",
    "    \n",
    "    lr = LinearRegression()\n",
    "    lr.fit(X.to_frame(),y)\n",
    "    intercept = lr.intercept_\n",
    "    coef = lr.coef_\n",
    "    return [intercept,coef]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[39.93586102117047, array([-0.15784473])]"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_estimates(data,np.arange(0,392))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "#modifying the boot mentioned that we used earlier\n",
    "def boot(data,func,R):\n",
    "    intercept = []\n",
    "    coeff = []\n",
    "    for i in range(R):\n",
    "        intercept.append(func(data,get_indices(data,100))[0])\n",
    "        coeff.append(func(data,get_indices(data,100))[1]) \n",
    "    intercept_statistics = {'estimated_value':np.mean(intercept),'std_error':np.std(intercept)}   \n",
    "    coeff_statistices = {'estimated_value':np.mean(coeff),'std_error':np.std(coeff)}   \n",
    "    return {'intercept':intercept_statistics,'coeff_statistices':coeff_statistices}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = boot(data,get_estimates,1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Result for intercept  {'estimated_value': 39.971231978520734, 'std_error': 1.7370193344068778}\n",
      "Result for coefficient term  {'estimated_value': -0.15881709682801645, 'std_error': 0.014778176069058804}\n"
     ]
    }
   ],
   "source": [
    "print('Result for intercept ',results['intercept'])\n",
    "print('Result for coefficient term ',results['coeff_statistices'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for bootstraping we have std error 1.69 for intercept, and 0.0144 for ceoff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    mpg   R-squared:                       0.606\n",
      "Model:                            OLS   Adj. R-squared:                  0.605\n",
      "Method:                 Least Squares   F-statistic:                     599.7\n",
      "Date:                Wed, 08 Jul 2020   Prob (F-statistic):           7.03e-81\n",
      "Time:                        11:03:19   Log-Likelihood:                -1178.7\n",
      "No. Observations:                 392   AIC:                             2361.\n",
      "Df Residuals:                     390   BIC:                             2369.\n",
      "Df Model:                           1                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const         39.9359      0.717     55.660      0.000      38.525      41.347\n",
      "horsepower    -0.1578      0.006    -24.489      0.000      -0.171      -0.145\n",
      "==============================================================================\n",
      "Omnibus:                       16.432   Durbin-Watson:                   0.920\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.305\n",
      "Skew:                           0.492   Prob(JB):                     0.000175\n",
      "Kurtosis:                       3.299   Cond. No.                         322.\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# for lets see what the model predicts\n",
    "import statsmodels.api as sm\n",
    "X = data['horsepower']\n",
    "y = data['mpg']\n",
    "\n",
    "X = sm.add_constant(X)\n",
    "results = sm.OLS(y,X).fit()\n",
    "print(results.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "# standard error are less for estimations using model\n",
    "# But still bootstrap estimates are mode preices, because they don't rely on assumptions, while there is a lot of assumptions\n",
    "# when calculating std errors using sm(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Result for intercept  {'estimated_value': 57.22194337269758, 'std_error': 4.268750160472158}\n",
      "Result for coefficient term horsepower {'estimated_value': -0.47175109701024376, 'std_error': 0.07214757143694592}\n",
      "Result for coefficient term horsepower**2 {'estimated_value': 0.0012511071230436879, 'std_error': 0.00026005778901839197}\n",
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                    mpg   R-squared:                       0.688\n",
      "Model:                            OLS   Adj. R-squared:                  0.686\n",
      "Method:                 Least Squares   F-statistic:                     428.0\n",
      "Date:                Wed, 08 Jul 2020   Prob (F-statistic):           5.40e-99\n",
      "Time:                        11:20:43   Log-Likelihood:                -1133.2\n",
      "No. Observations:                 392   AIC:                             2272.\n",
      "Df Residuals:                     389   BIC:                             2284.\n",
      "Df Model:                           2                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "================================================================================\n",
      "                   coef    std err          t      P>|t|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------\n",
      "const           56.9001      1.800     31.604      0.000      53.360      60.440\n",
      "horsepower      -0.4662      0.031    -14.978      0.000      -0.527      -0.405\n",
      "horsepower_2     0.0012      0.000     10.080      0.000       0.001       0.001\n",
      "==============================================================================\n",
      "Omnibus:                       16.158   Durbin-Watson:                   1.078\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               30.662\n",
      "Skew:                           0.218   Prob(JB):                     2.20e-07\n",
      "Kurtosis:                       4.299   Cond. No.                     1.29e+05\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 1.29e+05. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Lenovo\\Anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:2542: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.\n",
      "  return ptp(axis=axis, out=out, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Adding a quad term\n",
    "# similar to boot.fn in lab\n",
    "data['horsepower_2'] = data['horsepower']**2\n",
    "\n",
    "def get_estimates(data,index):\n",
    "    X = data[['horsepower','horsepower_2']].loc[index]\n",
    "    y = data['mpg'].loc[index]\n",
    "    \n",
    "    lr = LinearRegression()\n",
    "    lr.fit(X,y)\n",
    "    intercept = lr.intercept_\n",
    "    coef = lr.coef_\n",
    "    return [intercept,coef]\n",
    "\n",
    "get_estimates(data,np.arange(0,392))\n",
    "\n",
    "#modifying the boot mentioned that we used earlier\n",
    "def boot(data,func,R):\n",
    "    intercept = []\n",
    "    coeff_1 = []\n",
    "    coeff_2 = []\n",
    "    for i in range(R):\n",
    "        intercept.append(func(data,get_indices(data,100))[0])\n",
    "        coeff_1.append(func(data,get_indices(data,100))[1][0]) \n",
    "        coeff_2.append(func(data,get_indices(data,100))[1][1])\n",
    "    intercept_statistics = {'estimated_value':np.mean(intercept),'std_error':np.std(intercept)}   \n",
    "    coeff_1_statistices = {'estimated_value':np.mean(coeff_1),'std_error':np.std(coeff_1)}   \n",
    "    coeff_2_statistices = {'estimated_value':np.mean(coeff_2),'std_error':np.std(coeff_2)}   \n",
    "    return {'intercept':intercept_statistics,'coeff_1_statistices':coeff_1_statistices,'coeff_2_statistics':coeff_2_statistices}\n",
    "\n",
    "results = boot(data,get_estimates,1000)\n",
    "\n",
    "print('Result for intercept ',results['intercept'])\n",
    "print('Result for coefficient term horsepower',results['coeff_1_statistices'])\n",
    "print('Result for coefficient term horsepower**2',results['coeff_2_statistics'])\n",
    "\n",
    "\n",
    "# for bootstraping we have std error 1.69 for intercept, and 0.0144 for ceoff\n",
    "\n",
    "# for lets see what the model predicts\n",
    "import statsmodels.api as sm\n",
    "X = data[['horsepower','horsepower_2']]\n",
    "y = data['mpg']\n",
    "\n",
    "X = sm.add_constant(X)\n",
    "results = sm.OLS(y,X).fit()\n",
    "print(results.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
